# Model URL: https://modelscope.cn/models/sogagaga/GLM-4.5-Air-w4a8

# Exported so the vllm process started below inherits it.
# NOTE(review): presumably switches the W8A8 MoE path to W4A8 for this
# checkpoint — confirm against the vLLM build in use.
export VLLM_W8A8_MOE_USE_W4A8=1

# Serve the local GLM-4.5-Air-w4a8 checkpoint on port 12345.
# One option per line; the argument list is unchanged:
#   -tp 4                 tensor-parallel size of 4
#   --compilation-config  JSON passed through verbatim (single-quoted so the
#                         shell does not touch the inner double quotes)
vllm serve /data/ZhipuAI/GLM-4.5-Air-w4a8 \
  -tp 4 \
  --port 12345 \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'

# Example request (run from another shell once the server is listening):
# curl http://127.0.0.1:12345/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{
#     "model": "/data/ZhipuAI/GLM-4.5-Air-w4a8",
#     "messages": [
#       {
#         "role": "user",
#         "content": "请写一首描写黄河的七言古诗，只输出诗，不要任何解释。"
#       }
#     ],
#     "chat_template_kwargs": {"enable_thinking": true},
#     "temperature": 0.6
#   }'